## ---- Setup ----
## lmer estimation algorithm control
strict_tol <- lmerControl(optCtrl = list(xtol_abs = 1e-8,
                                         ftol_abs = 1e-8))

## ---- Data ----
## SOEP data
load("dat/proc-data/soep_imp.RData")

## ---- Shortened TSCS: AfD analyses, 2014-2018 (demeaning) ----
## Variable selection for demeaning
current_dm_vars <- soep_imp$imputations$imp1 %>%
  dplyr::select(dplyr::ends_with("_cwu")) %>%
  names() %>%
  substr(., 1, nchar(.) - 4)
additional_dm_vars <- c("nbh_met_p_deutschl",
                        "nbh_alq_p_quote",
                        "kr_uemprate",
                        "kr_foreigner",
                        "social_housing")
dm_vars <- c(current_dm_vars, additional_dm_vars)

## Updated regional data
regional_data <- rio::import("../../soep-data/soep.v38/stata/regionl.dta") %>%
  dplyr::filter(kkz > 0) %>%
  dplyr::select(kkz,
                syear,
                kr_uemprate,
                kr_foreigner) %>%
  dplyr::rename(year = syear) %>%
  dplyr::mutate_at(
    .vars = vars(kr_uemprate,
                 kr_foreigner),
    .funs = ~ ifelse(. < 0, NA_real_, .)
  ) %>%
  dplyr::distinct() %>%
  dplyr::group_by(kkz, year) %>%
  dplyr::summarize_all(mean) %>%
  dplyr::ungroup()

## Data management
for (m in seq_along(soep_imp$imputations)) {
  soep_imp$imputations[[m]] <- soep_imp$imputations[[m]] %>%
    dplyr::mutate(
      yrs_at_current_address = year - hh_at_current_address,
      age_at_current_address = age - yrs_at_current_address
    ) %>%
    dplyr::mutate(
      ltr_since_3 = as.numeric(yrs_at_current_address >= 3),
      ltr_since_5 = as.numeric(yrs_at_current_address >= 5),
      ltr_since_7 = as.numeric(yrs_at_current_address >= 7),
      ltr_age_30 = as.numeric(age_at_current_address <= 30)
    ) %>%
    dplyr::mutate(lw = as.numeric(lef == 1 | spd == 1 | gre == 1),
                  rw = as.numeric(cdu == 1 |
                                    fdp == 1 | afd == 1)) %>%
    filter(renter_no_rent == 0) %>%
    dplyr::select(
      id,
      hh_id,
      plz,
      kkz,
      year,
      year_fac,
      weight,
      starts_with("cold_rent_load"),
      starts_with("cold_rent_sqm"),
      wr_ecego,
      wr_immig,
      afd,
      vote_afd,
      gre,
      lef,
      fdp,
      cdu,
      spd,
      non,
      lw,
      rw,
      owner,
      all_of(dm_vars),
      starts_with("rent"),
      starts_with("cmr_arm"),
      starts_with("log_hinc_eq"),
      starts_with("gtyp"),
      starts_with("prop_personal_hinc"),
      starts_with("hh_prop_ecact"),
      starts_with("hh_mmb"),
      starts_with("mover"),
      starts_with("lm_part"),
      starts_with("edu"),
      starts_with("age"),
      starts_with("east"),
      starts_with("fem"),
      starts_with("east"),
      starts_with("ltr_"),
      social_housing
    ) %>%
    dplyr::filter(not(is.na(rent_umn))) %>%
    dplyr::filter(not(is.na(rent_cwu))) %>%
    dplyr::filter(not(is.na(cmr_arm_umn))) %>%
    dplyr::filter(not(is.na(cmr_arm_cwu)))
}

## Local market data
load("dat/proc-data/plz5_f_und_b.RData")
plz5_f_und_b <- plz5_f_und_b %>%
  st_set_geometry(NULL) %>%
  filter(year %in% 2005:2018) %>%
  dplyr::select(plz, year, cmr_arm, rent) %>%
  distinct() %>%
  group_by(plz) %>%
  mutate(
    cmr_arm_2005 = mean(cmr_arm[year %in% 2005:2007], na.rm = TRUE),
    cmr_arm_2018 = mean(cmr_arm[year %in% 2016:2018], na.rm = TRUE),
    cmr_arm_chg = cmr_arm_2018 - cmr_arm_2005
  ) %>%
  mutate(
    rent_2005 = mean(rent[year %in% 2005:2007], na.rm = TRUE),
    rent_2018 = mean(rent[year %in% 2016:2018], na.rm = TRUE),
    rent_chg = rent_2018 - rent_2005,
    cmr_arm_chg = ifelse(is.na(cmr_arm_chg), rent_chg, cmr_arm_chg),
    cmr_arm = ifelse(is.na(cmr_arm), rent, cmr_arm)
  ) %>%
  ungroup() %>%
  group_by(year) %>%
  mutate(
    rent_cat =
      ifelse(
        year == 2004,
        NA_integer_,
        cut(
          rent,
          breaks = quantile(rent, c(0, 1 / 3, 2 / 3, 1), na.rm = TRUE),
          labels = paste0("T", 1:3)
        )
      ),
    cmr_arm_cat =
      cut(
        cmr_arm,
        breaks = quantile(cmr_arm, c(0, 1 / 3, 2 / 3, 1), na.rm = TRUE),
        labels = paste0("T", 1:3)
      )
  ) %>%
  ungroup() %>%
  dplyr::select(plz, cmr_arm_chg, rent_chg, rent_cat, cmr_arm_cat, year) %>%
  distinct() %>%
  na.omit() %>%
  mutate(
    cmr_arm_chg_cat =
      cut(
        cmr_arm_chg,
        breaks = quantile(cmr_arm_chg, c(0, 1 / 3, 2 / 3, 1)),
        labels = paste0("T", 1:3)
      ),
    rent_chg_cat =
      cut(
        rent_chg,
        breaks = quantile(rent_chg, c(0, 1 / 3, 2 / 3, 1)),
        labels = paste0("T", 1:3)
      )
  )

## Additional data management
for (m in seq_along(soep_imp$imputations)) {
  soep_imp$imputations[[m]] <- soep_imp$imputations[[m]] %>%
    # top code household rents per sqm
    mutate(cold_rent_sqm = case_when(cold_rent_sqm > 40 ~ 40,
                                     TRUE ~ cold_rent_sqm)) %>%
    group_by(year) %>%
    mutate(log_hinc_eq_cat =
             cut(
               log_hinc_eq,
               breaks = quantile(log_hinc_eq, c(0, 1 / 3, 2 / 3, 1)),
               labels = paste0("T", 1:3)
             )) %>%
    ungroup() %>%
    mutate(
      gtyp4 = case_when(
        gtyp %in% c(
          "[1] Kernstaedte>==gr.Verdraum",
          "[2] Kernstaedte<==gr.Verdraum",
          "[9] Kernstaedte<==Verdansatz"
        ) ~ "urban",
        gtyp %in% c(
          "[7] Mi-zent.=laendl=gr.Verdraum",
          "[10] Mi-zent.=verd.=Verdansatz",
          "[12] Mi-zent.=laendl=Verdansatz",
          "[14] Mi-zent.=verd.=laendl.",
          "[16] Mi-zent.=laendl=laendl."
        ) ~ "towns",
        gtyp %in% c(
          "[3] Mi-zent.=hochverd.=gr.Verdraum",
          "[5] Mi-zent.=verd.=gr.Verdraum",
          "[4] so.Gem.=hochverd.=gr.Verdraum",
          "[6] so.Gem.=verd.=gr.Verdraum"
        ) ~ "suburban",
        gtyp %in% c(
          "[8] so.Gem.=laendl=gr.Verdraum",
          "[11] so.Gem.=verd.=Verdansatz",
          "[13] so.Gem.=laendl=Verdansatz",
          "[15] so.Gem.=verd.=laendl.",
          "[17] so.Gem.=laendl=laendl."
        ) ~ "rural",
        TRUE ~ NA_character_
      ),
      gtyp3 = ifelse(gtyp4 == "towns", "rural", gtyp4)
    ) %>%
    left_join(plz5_f_und_b,
              by = c("plz", "year")) %>%
    dplyr::filter(weight > 0)
  
  ## Repair names
  names(soep_imp$imputations[[m]]) <-
    gsub(" ", "_", names(soep_imp$imputations[[m]]))
}

## Join county-level data, within-demeaning
for (m in seq_along(soep_imp$imputations)) {
  soep_imp$imputations[[m]] <- soep_imp$imputations[[m]] %>%
    dplyr::select(-dplyr::starts_with("kr_")) %>%
    dplyr::left_join(regional_data,
                     by = c("kkz", "year")) %>%
    dplyr::select(-ends_with("_cwu"),-ends_with("_cwu"))
}


## 2014:2018 Data for AfD support analyses
dm_vars <- gsub(" ", "_", dm_vars)
soep_afd <- soep_imp
soep_afd$imputations <-
  lapply(soep_afd$imputations, function (imp) {
    imp %>%
      dplyr::filter(year >= 2014) %>%
      dplyr::select(-dplyr::ends_with("_cwu")) %>%
      dplyr::select(-dplyr::ends_with("_umn")) %>%
      group_by(id) %>%
      mutate_at(.vars = vars(all_of(dm_vars)),
                .funs = list(
                  umn = ~ mean(., na.rm = T),
                  cwu = ~ . - mean(., na.rm = T)
                )) %>%
      ungroup()
  })

## ---- Get descriptives ----
vars <- c(
  "cmr_arm",
  "cold_rent_sqm",
  "wr_ecego",
  "afd",
  "vote_afd",
  "gre",
  "fdp",
  "spd",
  "cdu",
  "lef",
  "non",
  "lw",
  "rw",
  "social_housing",
  "cmr_arm",
  "cold_rent_sqm",
  "log_hinc_eq",
  "log_hinc_eq_cat",
  "social_housing_cwu",
  "social_housing_umn",
  "cmr_arm_cwu",
  "cmr_arm_umn",
  "cold_rent_sqm_cwu",
  "cold_rent_sqm_umn",
  "log_hinc_eq_cwu",
  "log_hinc_eq_umn",
  "prop_personal_hinc_cwu",
  "prop_personal_hinc_umn",
  "hh_prop_ecact_cwu",
  "hh_prop_ecact_umn",
  "hh_mmb_cwu",
  "hh_mmb_umn",
  "lm_part_Atypical_umn",
  "lm_part_Inactive_umn",
  "lm_part_Unemployed_umn",
  "lm_part_In_Education_umn",
  "lm_part_Pensioners_umn",
  "lm_part_Atypical_cwu",
  "lm_part_Inactive_cwu",
  "lm_part_Unemployed_cwu",
  "lm_part_In_Education_cwu",
  "lm_part_Pensioners_cwu",
  "age",
  "east",
  "fem",
  "edu5",
  "gtyp3",
  "year",
  "kr_foreigner_cwu",
  "kr_uemprate_cwu",
  "kr_foreigner_umn",
  "kr_uemprate_umn",
  "nbh_met_p_deutschl_cwu",
  "nbh_alq_p_quote_cwu",
  "nbh_met_p_deutschl_umn",
  "nbh_alq_p_quote_umn"
)

cat_vars <- c("log_hinc_eq_cat",
              "edu5",
              "gtyp3")

subset_ltr <- c(NA_character_,
                "ltr_since_3 == 1",
                "ltr_since_5 == 1")

subset_cmr <- c(NA_character_)

subset_characteristics <- tibble()

## Capture start time
analysis <- 1
ptm <- proc.time()[3]

## Start loop
for (s in subset_ltr) {
  subset_tmp <- lapply(soep_afd$imputations,
                       function (dat) {
                         if (is.na(s)) {
                           dat$subset_ltr <- 1
                         } else {
                           ## Create subset
                           command <- paste0("dat$subset_ltr <- ",
                                             "as.numeric(dat$",
                                             s,
                                             ")")
                           eval(parse(text = command))
                         }

                         return(dat)
                       })
  
  for (t in subset_cmr) {
    subset_tmp <- lapply(subset_tmp,
                         function (dat) {
                           if (is.na(t)) {
                             dat$subset_cmr <- 1
                           } else {
                             ## Create subset
                             command <- paste0("dat$subset_cmr <- ",
                                               "as.numeric(dat$",
                                               t,
                                               ")")
                             eval(parse(text = command))
                           }
                           
                           ## Select
                           dat %>%
                             dplyr::select(owner,
                                           subset_ltr,
                                           subset_cmr,
                                           weight,
                                           id,
                                           all_of(vars)) %>%
                             dplyr::mutate_at(
                               .vars = vars(subset_ltr, subset_cmr, owner, gtyp3, log_hinc_eq_cat),
                               .funs = ~ as.factor(.)
                             )
                           return(dat)
                         })
    
    
    for (x in vars) {
      if (is.na(t)) {
        if (is.na(s) & x %in% c("cold_rent_sqm",
                                "cold_rent_sqm_cwu",
                                "cold_rent_sqm_umn")) {
          at_list <- NULL
          formula_local <- "1"
          join_vars <- c("at(subset_ltr)" = "subset_ltr",
                         "at(subset_cmr)" = "subset_cmr")
        } else if (is.na(s) & !(x %in% c("cold_rent_sqm",
                                         "cold_rent_sqm_cwu",
                                         "cold_rent_sqm_umn"))) {
          at_list <- list(owner = c(0, 1))
          formula_local <- "owner"
          join_vars <- c(
            "at(subset_ltr)" = "subset_ltr",
            "at(subset_cmr)" = "subset_cmr",
            "at(owner)" = "owner"
          )
        } else if (!is.na(s) & x %in% c("cold_rent_sqm",
                                        "cold_rent_sqm_cwu",
                                        "cold_rent_sqm_umn")) {
          at_list <- list(subset_ltr = c(0, 1))
          formula_local <- "subset_ltr"
          join_vars <- c("at(subset_ltr)" = "subset_ltr",
                         "at(subset_cmr)" = "subset_cmr")
        } else {
          at_list <- list(subset_ltr = c(0, 1),
                          owner = c(0, 1))
          formula_local <- "subset_ltr * owner"
          join_vars <- c(
            "at(subset_ltr)" = "subset_ltr",
            "at(subset_cmr)" = "subset_cmr",
            "at(owner)" = "owner"
          )
        }
      } else {
        if (is.na(s) & x %in% c("cold_rent_sqm",
                                "cold_rent_sqm_cwu",
                                "cold_rent_sqm_umn")) {
          at_list <- list(subset_cmr = c(0, 1))
          formula_local <- "subset_cmr"
          join_vars <- c("at(subset_ltr)" = "subset_ltr",
                         "at(subset_cmr)" = "subset_cmr")
        } else if (is.na(s) & !(x %in% c("cold_rent_sqm",
                                         "cold_rent_sqm_cwu",
                                         "cold_rent_sqm_umn"))) {
          at_list <- list(owner = c(0, 1),
                          subset_cmr = c(0, 1))
          formula_local <- "subset_cmr * owner"
          join_vars <- c(
            "at(subset_ltr)" = "subset_ltr",
            "at(subset_cmr)" = "subset_cmr",
            "at(owner)" = "owner"
          )
        } else if (!is.na(s) & x %in% c("cold_rent_sqm",
                                        "cold_rent_sqm_cwu",
                                        "cold_rent_sqm_umn")) {
          at_list <- list(subset_ltr = c(0, 1),
                          subset_cmr = c(0, 1))
          formula_local <- "subset_cmr * subset_ltr"
          join_vars <- c("at(subset_ltr)" = "subset_ltr",
                         "at(subset_cmr)" = "subset_cmr")
        } else {
          at_list <- list(subset_ltr = c(0, 1),
                          owner = c(0, 1),
                          subset_cmr = c(0, 1))
          formula_local <- "subset_cmr * subset_ltr * owner"
          join_vars <- c(
            "at(subset_ltr)" = "subset_ltr",
            "at(subset_cmr)" = "subset_cmr",
            "at(owner)" = "owner"
          )
        }
      }
      
      data_tmp <- lapply(subset_tmp,
                         function (dat) {
                           dat %>%
                             dplyr::select(owner,
                                           subset_ltr,
                                           subset_cmr,
                                           weight,
                                           id, !!as.name(x)) %>%
                             na.omit()
                         })
      
      if (not(is.numeric(data_tmp[[1]][[x]]))) {
        data_tmp <- lapply(data_tmp, function (dat) {
          bind_cols(
            dat %>%
              dplyr::select(-any_of(x)),
            dat %>%
              to_dummy(.,!!as.name(x), suffix = "label")
          ) %>%
            rename_all(make.names)
        })
        levels_x <- names(data_tmp[[1]])[5:ncol(data_tmp[[1]])]
        for (k in levels_x) {
          formula_tmp <-
            as.formula(paste(k, formula_local, sep = " ~ "))
          
          out_tmp <- lapply(data_tmp, function (dat) {
            mod_tmp <- lm(formula_tmp,
                          data = dat,
                          weights = weight)
            
            pred_tmp <- prediction::prediction(
              mod_tmp,
              at = at_list,
              vcov = sandwich::vcovCL(mod_tmp, cluster = dat$id)
            ) %>%
              summary()
            
            sum_tmp <- dat %>%
              group_by(subset_ltr, subset_cmr, owner) %>%
              summarize(n = n(),
                        sd = sd(!!as.name(k))) %>%
              ungroup()
            
            join_vars_tmp <-
              join_vars[names(join_vars) %in% names(pred_tmp)]
            
            pred_tmp <- pred_tmp %>%
              left_join(sum_tmp,
                        by = join_vars_tmp)
            
            return(pred_tmp)
          }) %>%
            pool_pred_summary() %>%
            mutate(Outcome = k,
                   Subset = s,
                   `Market Segment` = t)
          
          subset_characteristics <- bind_rows(subset_characteristics,
                                              out_tmp)
        }
      } else {
        formula_tmp <- as.formula(paste(x, formula_local, sep = " ~ "))
        
        out_tmp <- lapply(data_tmp, function (dat) {
          mod_tmp <- lm(formula_tmp,
                        data = dat,
                        weights = weight)
          
          pred_tmp <- prediction::prediction(
            mod_tmp,
            at = at_list,
            vcov = sandwich::vcovCL(mod_tmp, cluster = dat$id)
          ) %>%
            summary()
          
          sum_tmp <- dat %>%
            group_by(subset_ltr, subset_cmr, owner) %>%
            summarize(n = n(),
                      sd = sd(!!as.name(x))) %>%
            ungroup()
          
          join_vars_tmp <-
            join_vars[names(join_vars) %in% names(pred_tmp)]
          
          pred_tmp <- pred_tmp %>%
            left_join(sum_tmp,
                      by = join_vars_tmp)
          
          return(pred_tmp)
        }) %>%
          pool_pred_summary() %>%
          mutate(Outcome = x,
                 Subset = s,
                 `Market Segment` = t)
        
        subset_characteristics <- bind_rows(subset_characteristics,
                                            out_tmp)
      }
      ## Progress
      print(paste0(
        "Analysis: ",
        analysis,
        ". Time elapsed: ",
        round((proc.time()[3] - ptm) / 60, 2),
        " minutes."
      ))
      analysis <- analysis + 1L
    }
  }
}

subset_characteristics <- subset_characteristics %>%
  dplyr::mutate(
    subset_ltr = ifelse(is.na(subset_ltr), `at(subset_ltr)`, subset_ltr),
    owner = ifelse(is.na(owner), `at(owner)`, owner)
  ) %>%
  dplyr::select(-`at(subset_ltr)`,-`at(owner)`)

subset_characteristics %>%
  write.csv(paste0("est/", "subset_characteristics.csv"))
